voxel_size: [0.075, 0.075, 0.2]
point_cloud_range: [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]

model:
  encoders:
    camera:
      backbone:
        type: SwinTransformer
        embed_dims: 96
        depths: [2, 2, 6, 2]
        num_heads: [3, 6, 12, 24]
        window_size: 7
        mlp_ratio: 4
        qkv_bias: true
        qk_scale: null
        drop_rate: 0.
        attn_drop_rate: 0.
        drop_path_rate: 0.2
        patch_norm: true
        out_indices: [1, 2, 3]
        with_cp: false
        convert_weights: true
        init_cfg:
          type: Pretrained
          checkpoint: https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
      neck:
        in_channels: [192, 384, 768]
      vtransform:
        xbound: [-54.0, 54.0, 0.3]
        ybound: [-54.0, 54.0, 0.3]
    lidar:
      voxelize:
        point_cloud_range: ${point_cloud_range}
        voxel_size: ${voxel_size}
        max_voxels: [120000, 160000]
      backbone:
        sparse_shape: [1440, 1440, 41]

  heads:
    object:
      train_cfg:
        grid_size: [1440, 1440, 41]
      test_cfg:
        grid_size: [1440, 1440, 41]

lr_config:
  policy: CosineAnnealing
  warmup: linear
  warmup_iters: 500
  warmup_ratio: 0.33333333
  min_lr_ratio: 1.0e-3
