model:
  encoders:
    camera:
      neck:
        type: GeneralizedLSSFPN
        in_channels: [512, 1024, 2048]
        out_channels: 256
        start_level: 0
        num_outs: 3
        norm_cfg:
          type: BN2d
          requires_grad: true
        act_cfg:
          type: ReLU
          inplace: true
        upsample_cfg:
          mode: bilinear
          align_corners: false
      vtransform:
        type: DepthLSSTransform
        in_channels: 256
        out_channels: 80
        image_size: ${image_size}
        feature_size: ${[image_size[0] // 8, image_size[1] // 8]}
        xbound: [-51.2, 51.2, 0.4]
        ybound: [-51.2, 51.2, 0.4]
        zbound: [-10.0, 10.0, 20.0]
        dbound: [1.0, 60.0, 0.5]
        downsample: 2
    lidar:
      voxelize:
        max_num_points: 10
        point_cloud_range: ${point_cloud_range}
        voxel_size: ${voxel_size}
        max_voxels: [90000, 120000]
      backbone:
        type: SparseEncoder
        in_channels: 5
        sparse_shape: [1024, 1024, 41]
        output_channels: 128
        order:
          - conv
          - norm
          - act
        encoder_channels:
          - [16, 16, 32]
          - [32, 32, 64]
          - [64, 64, 128]
          - [128, 128]
        encoder_paddings:
          - [0, 0, 1]
          - [0, 0, 1]
          - [0, 0, [1, 1, 0]]
          - [0, 0]
        block_type: basicblock

lr_config: null

optimizer:
  lr: 2.0e-4

max_epochs: 6
