# Plugin switches for this config.
# NOTE(review): presumably read by the launch scripts to import the custom
# modules that live under `plugin_dir` -- confirm against the entry point.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# Shared constants for the sub-models configured further down this file.
# Names ending in `_cmt` are referenced by the camera + LiDAR detector, names
# ending in `_streampetr` by the camera-only detector.
# NOTE(review): the ranges presumably follow the mmdet3d layout
# [x_min, y_min, z_min, x_max, y_max, z_max] -- confirm.
point_cloud_range_cmt = [
    -54.0, -54.0, -5.0,
    54.0, 54.0, 3.0,
]
point_cloud_range_streampetr = [
    -51.2, -51.2, -5.0,
    51.2, 51.2, 3.0,
]

# Detection classes; the order is part of the config and must not change.
class_names = [
    'car',
    'truck',
    'construction_vehicle',
    'bus',
    'trailer',
    'barrier',
    'motorcycle',
    'bicycle',
    'pedestrian',
    'traffic_cone',
]

voxel_size_cmt = [0.1, 0.1, 0.2]
voxel_size_streampetr = [0.2, 0.2, 8]
out_size_factor = 8

evaluation = {'interval': 20}

dataset_type = 'CustomNuScenesDataset'
# Ends with a path separator: joins below must not add another one.
data_root = 'data/nuscenes/'

# Sensor modality flags handed to the datasets of each branch.
input_modality_cmt = {
    'use_lidar': True,
    'use_camera': True,
    'use_radar': False,
    'use_map': False,
    'use_external': False,
}
input_modality_streampetr = {
    'use_lidar': False,
    'use_camera': True,
    'use_radar': False,
    'use_map': False,
    'use_external': True,
}

# Extra per-sample keys collected for the camera-only branch.
collect_keys_streampetr = [
    'lidar2img',
    'intrinsics',
    'extrinsics',
    'timestamp',
    'img_timestamp',
    'ego_pose',
    'ego_pose_inv',
]

# Per-channel normalisation statistics, unpacked into every
# `NormalizeMultiviewImage` step in this file.
img_norm_cfg = {
    'mean': [103.530, 116.280, 123.675],
    'std': [57.375, 57.120, 58.395],
    'to_rgb': False,
}

# Image-data-augmentation settings passed as `data_aug_conf` to the
# resize/crop/flip image transforms.
# NOTE(review): `H`/`W` are presumably the raw camera image size and
# `final_dim` the network input size as (height, width) -- confirm.
ida_aug_conf = dict(
    resize_lim=(0.47, 0.625),
    final_dim=(320, 800),
    bot_pct_lim=(0.0, 0.0),
    rot_lim=(0.0, 0.0),
    H=900,
    W=1600,
    rand_flip=True,
)

# Evaluation-time pipeline shared by `data.val` and `data.test`.
# The loading steps run first; every remaining transform is nested inside the
# single `MultiScaleFlipAug3D` wrapper (flip disabled, one scale).
test_pipeline = [
    # --- loading ---
    dict(type='LoadPointsFromFile',
         coord_type='LIDAR',
         load_dim=5,
         use_dim=[0, 1, 2, 3, 4]),
    dict(type='LoadPointsFromMultiSweeps',
         sweeps_num=10,
         use_dim=[0, 1, 2, 3, 4]),
    dict(type='LoadMultiViewImageFromFiles'),
    # No camera is listed for masking and LiDAR masking is switched off.
    dict(type='ModalMask3DAny', mask_cameras=[], mask_lidar=False),
    # --- transforms + formatting ---
    dict(type='MultiScaleFlipAug3D',
         img_scale=(1333, 800),
         pts_scale_ratio=1,
         flip=False,
         transforms=[
             # Zero rotation, unit scale and zero translation std.
             dict(type='GlobalRotScaleTrans',
                  rot_range=[0, 0],
                  scale_ratio_range=[1.0, 1.0],
                  translation_std=[0, 0, 0]),
             dict(type='RandomFlip3D'),
             dict(type='ResizeCropFlipImage',
                  data_aug_conf=ida_aug_conf,
                  training=False),
             dict(type='NormalizeMultiviewImage', **img_norm_cfg),
             dict(type='PadMultiViewImage', size_divisor=32),
             dict(type='DefaultFormatBundle3D',
                  class_names=class_names,
                  with_label=False),
             dict(type='Collect3D', keys=['points', 'img']),
         ]),
]
# Dataloader settings. Only `val` and `test` splits are declared here, and
# both point at the same validation info file with the same pipeline.
data = dict(
    # Per-GPU batch size.
    # NOTE(review): the previous comment read "4 * 4 = 16 samples per GPU",
    # which contradicts the value; presumably it meant 4 GPUs x 4 samples =
    # 16 samples in total -- confirm.
    samples_per_gpu=4,
    workers_per_gpu=4,
    val=dict(
        type=dataset_type,
        data_root=data_root,
        # `data_root` already ends with '/', so no extra separator is added
        # (a leading '/' here produced 'data/nuscenes//nuscenes_infos_val.pkl').
        ann_file=data_root + 'nuscenes_infos_val.pkl',
        load_interval=1,
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality_cmt,
        test_mode=True,
        box_type_3d='LiDAR'),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        # Same file as `val`; see the note there about the path join.
        ann_file=data_root + 'nuscenes_infos_val.pkl',
        load_interval=1,
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality_cmt,
        test_mode=True,
        box_type_3d='LiDAR'))
# Frame count reused for the three `num_frame_*` options of the camera-only
# detector below.
num_frame_losses = 1

# Composite model made of three sub-models, each described by the same four
# entries (`detector`, `img_pipeline`, `pts_pipeline`, `dataset`):
#   * camera_lidar_detector -- CMT-style detector using images and points
#   * camera_only_detector  -- DETR3D-style temporal detector using images
#   * scene_classifier      -- small vision transformer
# NOTE(review): how `SceneDetectorV2` combines the three is not visible in
# this file -- see its implementation in the plugin directory.
model = dict(
    type='SceneDetectorV2',
    # ------------------------------------------------------------------
    # Camera + LiDAR branch
    # ------------------------------------------------------------------
    camera_lidar_detector=dict(
        type='CmtPipelineDetector',
        detector=dict(
            type='CmtDetectorWithTopPModalRouter',
            top_p_threshold=0.9,
            loss_router_dynamic_weight=1e-4,
            loss_router_balance_weight=1e-2,
            use_grid_mask=True,
            masked_img_path=None,
            always_use_lidar=True,
            img_backbone=dict(
                type='ResNet',
                depth=50,
                num_stages=4,
                out_indices=(2, 3),
                frozen_stages=-1,
                norm_cfg=dict(type='BN', requires_grad=True),
                norm_eval=True,
                with_cp=True,
                style='pytorch'),
            img_neck=dict(
                type='CPFPN',
                in_channels=[1024, 2048],
                out_channels=256,
                num_outs=2),
            pts_voxel_layer=dict(
                num_point_features=5,
                max_num_points=10,
                voxel_size=voxel_size_cmt,
                max_voxels=(120000, 160000),
                point_cloud_range=point_cloud_range_cmt),
            pts_voxel_encoder=dict(
                type='HardSimpleVFE',
                num_features=5,
            ),
            pts_middle_encoder=dict(
                type='SparseEncoder',
                in_channels=5,
                # NOTE(review): point_cloud_range_cmt spans 108 m per axis and
                # voxel_size_cmt is 0.1 m, i.e. 1080 cells, yet 1024 is used
                # here and in the `grid_size` entries below -- confirm this
                # is intentional.
                sparse_shape=[41, 1024, 1024],
                output_channels=128,
                order=('conv', 'norm', 'act'),
                encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)),
                encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)),
                block_type='basicblock'),
            pts_backbone=dict(
                type='SECOND',
                in_channels=256,
                out_channels=[128, 256],
                layer_nums=[5, 5],
                layer_strides=[1, 2],
                norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
                conv_cfg=dict(type='Conv2d', bias=False)),
            pts_neck=dict(
                type='SECONDFPN',
                in_channels=[128, 256],
                out_channels=[256, 256],
                upsample_strides=[1, 2],
                norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
                upsample_cfg=dict(type='deconv', bias=False),
                use_conv_for_no_stride=True),
            pts_bbox_head=dict(
                type='CmtHead',
                in_channels=512,
                hidden_dim=256,
                downsample_scale=8,
                common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
                # A single task covering all ten classes.
                tasks=[
                    dict(num_class=10, class_names=[
                        'car', 'truck', 'construction_vehicle',
                        'bus', 'trailer', 'barrier',
                        'motorcycle', 'bicycle',
                        'pedestrian', 'traffic_cone'
                    ]),
                ],
                bbox_coder=dict(
                    type='MultiTaskBBoxCoder',
                    post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
                    pc_range=point_cloud_range_cmt,
                    max_num=300,
                    voxel_size=voxel_size_cmt,
                    num_classes=10),
                separate_head=dict(
                    type='SeparateTaskHead', init_bias=-2.19, final_kernel=1),
                transformer=dict(
                    type='CmtTransformer',
                    decoder=dict(
                        type='PETRTransformerDecoder',
                        return_intermediate=True,
                        num_layers=6,
                        transformerlayers=dict(
                            type='PETRTransformerDecoderLayer',
                            with_cp=False,
                            # One config per attention op in `operation_order`:
                            # first self-attention, then cross-attention.
                            attn_cfgs=[
                                dict(
                                    type='MultiheadAttention',
                                    embed_dims=256,
                                    num_heads=8,
                                    dropout=0.1),
                                dict(
                                    type='PETRMultiheadFlashAttention',
                                    embed_dims=256,
                                    num_heads=8,
                                    dropout=0.1),
                            ],
                            ffn_cfgs=dict(
                                type='FFN',
                                embed_dims=256,
                                feedforward_channels=1024,
                                num_fcs=2,
                                ffn_drop=0.,
                                act_cfg=dict(type='ReLU', inplace=True),
                            ),
                            feedforward_channels=1024,  # unused
                            operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                             'ffn', 'norm')),
                    )),
                loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=2.0),
                loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
                loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0),
            ),
            train_cfg=dict(
                pts=dict(
                    dataset='nuScenes',
                    assigner=dict(
                        type='HungarianAssigner3D',
                        cls_cost=dict(type='FocalLossCost', weight=2.0),
                        reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
                        # Fake cost (weight 0), kept only for compatibility
                        # with the DETR head.
                        iou_cost=dict(type='IoUCost', weight=0.0),
                        pc_range=point_cloud_range_cmt,
                        code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
                    ),
                    pos_weight=-1,
                    gaussian_overlap=0.1,
                    min_radius=2,
                    # Presumably [x_len, y_len, z_len] in voxels; the earlier
                    # comment said "[x_len, y_len, 1]", which did not match
                    # the value. See the review note on `sparse_shape`.
                    grid_size=[1024, 1024, 40],
                    voxel_size=voxel_size_cmt,
                    out_size_factor=out_size_factor,
                    code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
                    point_cloud_range=point_cloud_range_cmt)),
            test_cfg=dict(
                pts=dict(
                    dataset='nuScenes',
                    grid_size=[1024, 1024, 40],
                    out_size_factor=out_size_factor,
                    pc_range=point_cloud_range_cmt,
                    voxel_size=voxel_size_cmt,
                    nms_type=None,
                    nms_thr=0.2,
                    use_rotate_nms=True,
                    max_num=200
                ))),
        # Image-side preprocessing applied to samples of this branch.
        img_pipeline=[
            dict(
                type='MultiScaleFlipAug3D',
                img_scale=(1333, 800),
                pts_scale_ratio=1,
                flip=False,
                transforms=[
                    dict(
                        type='GlobalRotScaleTransWithoutPoints',
                        rot_range=[0, 0],
                        scale_ratio_range=[1.0, 1.0],
                        translation_std=[0, 0, 0]),
                    dict(type='RandomFlip', flip_ratio=0.0),
                    dict(type='ResizeCropFlipImage', data_aug_conf=ida_aug_conf, training=False),
                    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
                    dict(type='PadMultiViewImage', size_divisor=32),
                    dict(
                        type='DefaultFormatBundle3D',
                        class_names=class_names,
                        with_label=False),
                    dict(type='Collect3D', keys=['img'],
                         meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img',
                                    'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip',
                                    'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d',
                                    'box_type_3d', 'img_norm_cfg', 'pcd_trans', 'sample_idx',
                                    'pcd_scale_factor', 'pcd_rotation', 'pcd_rotation_angle',
                                    'pts_filename', 'transformation_3d_flow', 'trans_mat',
                                    'affine_aug'))
                ])
        ],
        # Point-cloud-side preprocessing; collects the same meta keys as the
        # image pipeline above.
        pts_pipeline=[
            dict(
                type='MultiScaleFlipAug3D',
                img_scale=(1333, 800),
                pts_scale_ratio=1,
                flip=False,
                transforms=[
                    dict(
                        type='GlobalRotScaleTrans',
                        rot_range=[0, 0],
                        scale_ratio_range=[1.0, 1.0],
                        translation_std=[0, 0, 0]),
                    dict(type='RandomFlip3DPoints'),
                    dict(
                        type='DefaultFormatBundle3D',
                        class_names=class_names,
                        with_label=False),
                    dict(type='Collect3D', keys=['points'],
                         meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img',
                                    'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip',
                                    'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d',
                                    'box_type_3d', 'img_norm_cfg', 'pcd_trans', 'sample_idx',
                                    'pcd_scale_factor', 'pcd_rotation', 'pcd_rotation_angle',
                                    'pts_filename', 'transformation_3d_flow', 'trans_mat',
                                    'affine_aug'))
                ])
        ],
        dataset=dict(
            type='CustomNuScenesDataset',
            data_root=data_root,
            # `data_root` already ends with '/', so no extra separator is
            # added (a leading '/' here produced a doubled slash).
            ann_file=data_root + 'nuscenes_infos_val.pkl',
            load_interval=1,
            pipeline=[
                dict(type='LoadMultiViewImageFromFiles'),
                dict(
                    type='LoadPointsFromFile',
                    coord_type='LIDAR',
                    load_dim=5,
                    use_dim=[0, 1, 2, 3, 4]),
                dict(
                    type='LoadPointsFromMultiSweeps',
                    sweeps_num=10,
                    use_dim=[0, 1, 2, 3, 4])],
            classes=class_names,
            modality=input_modality_cmt,
            test_mode=True,
            box_type_3d='LiDAR')),
    # ------------------------------------------------------------------
    # Camera-only branch
    # ------------------------------------------------------------------
    camera_only_detector=dict(
        type='RepDetr3DPipelineDetector',
        detector=dict(
            type='RepDetr3DWithTopPModalRouter',
            top_p_threshold=0.90,
            loss_router_dynamic_weight=0.01,
            loss_router_balance_weight=0.1,
            router_only=True,
            num_frame_head_grads=num_frame_losses,
            num_frame_backbone_grads=num_frame_losses,
            num_frame_losses=num_frame_losses,
            use_grid_mask=True,
            stride=[8, 16, 32, 64],
            position_level=[0, 1, 2, 3],
            img_backbone=dict(
                type='VoVNetCP',  # use checkpoint to save memory
                spec_name='V-99-eSE',
                norm_eval=True,
                frozen_stages=-1,
                input_ch=3,
                out_features=('stage2', 'stage3', 'stage4', 'stage5',)),
            img_neck=dict(
                type='FPN',  # remove unused parameters
                start_level=1,
                add_extra_convs='on_output',
                relu_before_extra_convs=True,
                in_channels=[256, 512, 768, 1024],
                out_channels=256,
                num_outs=4),
            img_roi_head=dict(
                type='YOLOXHeadCustom',
                num_classes=10,
                in_channels=256,
                strides=[8, 16, 32, 64],
                train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)),
                test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.65)),
            ),
            pts_bbox_head=dict(
                type='SparseHead',
                num_classes=10,
                in_channels=256,
                num_query=644,
                memory_len=1024,
                topk_proposals=256,
                num_propagated=256,
                scalar=10,  # noise groups
                noise_scale=1.0,
                dn_weight=1.0,  # dn loss weight
                split=0.75,  # positive rate
                with_dn=True,
                with_ego_pos=True,
                match_with_velo=False,
                code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
                transformer=dict(
                    type='Detr3DTransformer',
                    decoder=dict(
                        type='Detr3DTransformerDecoder',
                        embed_dims=256,
                        num_layers=6,
                        transformerlayers=dict(
                            type='Detr3DTemporalDecoderLayer',
                            batch_first=True,
                            # One config per attention op in `operation_order`:
                            # first self-attention, then cross-attention.
                            attn_cfgs=[
                                dict(
                                    type='MultiheadAttention',
                                    embed_dims=256,
                                    num_heads=8,
                                    dropout=0.1),
                                dict(
                                    type='DeformableFeatureAggregationCuda',
                                    embed_dims=256,
                                    num_groups=8,
                                    num_levels=4,
                                    num_cams=6,
                                    dropout=0.1,
                                    num_pts=13,
                                    bias=2.),
                            ],
                            feedforward_channels=2048,
                            ffn_dropout=0.1,
                            with_cp=True,  # use checkpoint to save memory
                            operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                             'ffn', 'norm')),
                    )),
                bbox_coder=dict(
                    type='NMSFreeCoder',
                    post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
                    pc_range=point_cloud_range_streampetr,
                    max_num=300,
                    voxel_size=voxel_size_streampetr,
                    num_classes=10),
                loss_cls=dict(
                    type='FocalLoss',
                    use_sigmoid=True,
                    gamma=2.0,
                    alpha=0.25,
                    loss_weight=2.0),
                loss_bbox=dict(type='L1Loss', loss_weight=0.25),
                loss_iou=dict(type='GIoULoss', loss_weight=0.0),
            ),
            # Model training settings (no `test_cfg` is given for this branch).
            train_cfg=dict(pts=dict(
                grid_size=[512, 512, 1],
                voxel_size=voxel_size_streampetr,
                point_cloud_range=point_cloud_range_streampetr,
                out_size_factor=4,
                assigner=dict(
                    type='HungarianAssigner3D',
                    cls_cost=dict(type='FocalLossCost', weight=2.0),
                    reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
                    # Fake cost (weight 0), kept only for compatibility with
                    # the DETR head.
                    iou_cost=dict(type='IoUCost', weight=0.0),
                    pc_range=point_cloud_range_streampetr)))),
        img_pipeline=[
            dict(type='ResizeCropFlipRotImage', data_aug_conf=ida_aug_conf, training=False),
            dict(type='NormalizeMultiviewImage', **img_norm_cfg),
            dict(type='PadMultiViewImage', size_divisor=32),
            dict(
                type='MultiScaleFlipAug3D',
                img_scale=(1333, 800),
                pts_scale_ratio=1,
                flip=False,
                transforms=[
                    dict(
                        type='PETRFormatBundle3D',
                        collect_keys=collect_keys_streampetr,
                        class_names=class_names,
                        with_label=False),
                    dict(type='Collect3D', keys=['img'] + collect_keys_streampetr,
                         meta_keys=('filename', 'ori_shape', 'img_shape', 'pad_shape',
                                    'scale_factor', 'flip', 'box_mode_3d', 'box_type_3d',
                                    'img_norm_cfg', 'scene_token', 'sample_idx'))
                ])
        ],
        # This branch has no point-cloud input.
        pts_pipeline=None,
        dataset=dict(
            type='StreamPETRNuScenesDataset',
            pipeline=[
                dict(type='LoadMultiViewImageFromFiles', to_float32=True)],
            collect_keys=collect_keys_streampetr + ['img', 'img_metas'],
            queue_length=1,
            ann_file=data_root + 'nuscenes2d_temporal_infos_val.pkl',
            classes=class_names,
            modality=input_modality_streampetr,
        )),
    # ------------------------------------------------------------------
    # Scene classifier
    # ------------------------------------------------------------------
    scene_classifier=dict(
        type='VisionTransformerPipelineDetector',
        detector=dict(
            type='VisionTransformer',
            patch_size=16,
            embed_dim=192,
            depth=3,
            num_heads=3,
            multi_cls_tokens=True,
            mlp_head=False,
            num_classes=2,
            num_classifier=7,
            num_images=1),
        img_pipeline=None,
        pts_pipeline=None,
        dataset=dict(
            type='SceneClassifierNuScenesDataset',
            data_root=data_root,
            # `data_root` already ends with '/', so no extra separator is
            # added (a leading '/' here produced a doubled slash).
            info_file=data_root + 'nuscenes_infos_val.pkl',
            image_size=224,
            used_cams=['CAM_FRONT'])))

# Optimiser: AdamW with reduced learning rates for the image backbone and
# image neck (selected through `custom_keys`).
# NOTE(review): the original remarks said the learning rate was lowered from
# 0.00014 and that the settings were meant "for 8gpu * 2sample_per_gpu",
# whereas `data.samples_per_gpu` in this file is 4 -- confirm the values
# still match the intended total batch size.
optimizer = {
    'type': 'AdamW',
    'lr': 1e-4,
    'paramwise_cfg': {
        'custom_keys': {
            'img_backbone': {'lr_mult': 0.01, 'decay_mult': 5},
            'img_neck': {'lr_mult': 0.1},
        },
    },
    'weight_decay': 0.01,
}

# Mixed-precision optimiser hook with dynamic loss scaling and gradient
# clipping; the three point-cloud modules listed in `custom_fp16` are
# flagged False.
optimizer_config = {
    'type': 'CustomFp16OptimizerHook',
    'loss_scale': 'dynamic',
    'grad_clip': {'max_norm': 35, 'norm_type': 2},
    'custom_fp16': {
        'pts_voxel_encoder': False,
        'pts_middle_encoder': False,
        'pts_bbox_head': False,
    },
}

# Learning-rate and momentum schedules. Both use the 'cyclic' policy with a
# single cycle and the same `step_ratio_up`.
lr_config = {
    'policy': 'cyclic',
    'target_ratio': (6, 0.0001),
    'cyclic_times': 1,
    'step_ratio_up': 0.4,
}
momentum_config = {
    'policy': 'cyclic',
    'target_ratio': (0.8947368421052632, 1),
    'cyclic_times': 1,
    'step_ratio_up': 0.4,
}
# Runtime settings: schedule length, checkpointing, logging and launch options.
total_epochs = 20
checkpoint_config = {'interval': 1}
log_config = {
    'interval': 50,
    'hooks': [
        {'type': 'TextLoggerHook'},
        {'type': 'TensorboardLoggerHook'},
    ],
}
dist_params = {'backend': 'nccl'}
log_level = 'INFO'
work_dir = None
# Weights loaded at start-up; no run is resumed.
load_from = 'checkpoints/nuim_r50.pth'
resume_from = None
workflow = [('train', 1)]
gpu_ids = range(8)
